"""
Feature engineering — the analytical core of this project.
Raw Sparkov transactions are turned into signals a fraud model can learn from.
Every per-card feature is computed in strict time order and looks **only at the
past** (closed='left' rolling windows, shifted expanding stats). This prevents
target leakage: at scoring time you never know the current/future transactions.
Feature families
----------------
1. Transaction — amount, log-amount
2. Temporal — hour, day-of-week, night flag, weekend flag
3. Demographic — cardholder age, city population
4. Geo — haversine distance home→merchant, and from previous txn
5. Velocity — rolling count / sum / mean of txns per card (1h/24h/7d)
6. Behavioral — deviation of amount from the card's own past average,
   time since previous txn, distinct merchants in 24h
The velocity + behavioral families are what catch real fraud: a stolen card
shows a burst of transactions, in new locations, deviating from normal spend.
"""
from __future__ import annotations
import numpy as np
import pandas as pd
from src import config
EARTH_RADIUS_KM = 6371.0088


# ── Geo ─────────────────────────────────────────────────────────────────────
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometres.

    All four arguments are in degrees and are converted with ``np.radians``.
    The arithmetic is element-wise NumPy, so the inputs may be scalars or
    array-likes; the result has the corresponding shape.
    """
    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2.0
    half_dlam = (lam2 - lam1) / 2.0
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2

    # Clamp to [0, 1] so rounding error cannot push arcsin outside its domain.
    clamped = np.clip(hav, 0, 1)
    return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(clamped))
# ── Feature builders (each returns the df with new columns) ─────────────────
def _add_temporal(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar features derived from the transaction timestamp.

    Writes ``hour``, ``day_of_week`` and the int8 flags ``is_night``
    (hour < 6 or hour >= 22) and ``is_weekend`` (day_of_week >= 5) onto
    ``df`` in place and returns the same frame.
    """
    stamp = df[config.TIME_COL].dt
    hour_of_day = stamp.hour
    weekday = stamp.dayofweek

    df["hour"] = hour_of_day
    df["day_of_week"] = weekday

    late_or_early = hour_of_day.lt(6) | hour_of_day.ge(22)
    df["is_night"] = late_or_early.astype("int8")
    df["is_weekend"] = weekday.ge(5).astype("int8")
    return df
def _add_demographic(df: pd.DataFrame) -> pd.DataFrame:
    """Add cardholder ``age`` and ``city_pop_log`` in place and return ``df``.

    ``age`` is the whole-day gap between the transaction time and ``dob``
    divided by 365.25, bounded to [0, 120]. ``city_pop_log`` is
    log(1 + city_pop) with negative populations floored at 0.
    """
    elapsed_days = (df[config.TIME_COL] - df["dob"]).dt.days
    years = elapsed_days / 365.25
    df["age"] = years.clip(lower=0, upper=120)

    population = df["city_pop"].clip(lower=0)
    df["city_pop_log"] = np.log1p(population)
    return df
def _add_amount(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``amt_log`` = log(1 + amt), flooring negative amounts at 0 first.

    The column is written onto ``df`` in place and the same frame is returned.
    """
    non_negative_amt = df["amt"].clip(lower=0)
    df["amt_log"] = np.log1p(non_negative_amt)
    return df
def _add_geo(df: pd.DataFrame) -> pd.DataFrame:
    """Add the two distance features and return the frame sorted by card, time.

    ``dist_home_merchant_km`` is the haversine distance between
    (``lat``, ``long``) and (``merch_lat``, ``merch_long``) on the same row.
    ``dist_from_prev_txn_km`` is the distance between this row's merchant
    location and the merchant location of the card's previous row; rows with
    no previous row get 0.0.
    """
    # Row-wise distance needs no ordering, so compute it before sorting.
    df["dist_home_merchant_km"] = haversine_km(
        df["lat"], df["long"], df["merch_lat"], df["merch_long"]
    )

    # Movement-speed proxy: how far the card "travelled" since its last use.
    ordered = df.sort_values([config.CARD_COL, config.TIME_COL])
    previous = ordered.groupby(config.CARD_COL)[["merch_lat", "merch_long"]].shift(1)
    hop_km = haversine_km(
        ordered["merch_lat"],
        ordered["merch_long"],
        previous["merch_lat"],
        previous["merch_long"],
    )
    ordered["dist_from_prev_txn_km"] = hop_km.fillna(0.0)
    return ordered
def _add_velocity(df: pd.DataFrame) -> pd.DataFrame:
    """Add past-only per-card velocity features over 1h / 24h / 7d windows.

    For each window this writes ``txn_count_<w>`` and ``amt_sum_<w>``
    (float32, 0 when the window is empty). It also adds ``amt_mean_24h``
    (0 when there were no transactions in the window) and
    ``secs_since_prev_txn`` (-1 for a card's first row).

    Returns a new frame sorted by card then time with a fresh RangeIndex.
    """
    card_key = config.CARD_COL
    time_key = config.TIME_COL
    df = df.sort_values([card_key, time_key]).reset_index(drop=True)

    for span in ("1h", "24h", "7d"):
        # closed="left" keeps the current row out of its own window.
        past_amounts = df.groupby(card_key).rolling(
            span, on=time_key, closed="left"
        )["amt"]
        n_past = past_amounts.count().reset_index(level=0, drop=True)
        total_past = past_amounts.sum().reset_index(level=0, drop=True)
        df[f"txn_count_{span}"] = n_past.fillna(0).astype("float32").values
        df[f"amt_sum_{span}"] = total_past.fillna(0).astype("float32").values

    # Mean over the past 24h; a zero count is masked to avoid dividing by 0.
    safe_count = df["txn_count_24h"].replace(0, np.nan)
    mean_24h = (df["amt_sum_24h"] / safe_count).fillna(0.0)
    df["amt_mean_24h"] = mean_24h.astype("float32")

    # Gap to the card's previous transaction, in seconds.
    gap = df.groupby(card_key)[time_key].diff().dt.total_seconds()
    df["secs_since_prev_txn"] = gap.fillna(-1.0).astype("float32")
    return df
def _add_behavioral(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-card behavioural features computed from strictly earlier rows.

    Columns added (all float32):
        amt_dev_from_card_mean -- amount minus the card's mean over earlier rows
        amt_ratio_to_card_mean -- amount divided by that mean, capped at 1000
        distinct_merchants_24h -- number of distinct merchants on the card in
                                  the preceding 24 hours (0 if none)

    Returns a new frame sorted by card then time with a fresh RangeIndex.
    """
    df = df.sort_values([config.CARD_COL, config.TIME_COL]).reset_index(drop=True)
    g = df.groupby(config.CARD_COL)["amt"]

    # Running total minus the current row = sum over strictly earlier rows.
    cumsum_prev = g.cumsum() - df["amt"]
    cumcount_prev = g.cumcount()  # number of strictly earlier rows on the card
    past_mean = cumsum_prev / cumcount_prev.replace(0, np.nan)
    # No history yet: fall back to the row's own amount, which makes the
    # deviation 0 and the ratio 1 (a neutral signal).
    past_mean = past_mean.fillna(df["amt"])

    df["amt_dev_from_card_mean"] = (df["amt"] - past_mean).astype("float32")
    # A zero past mean would divide by zero; mask it and treat the ratio as 1.
    df["amt_ratio_to_card_mean"] = (
        df["amt"] / past_mean.replace(0, np.nan)
    ).fillna(1.0).clip(upper=1000).astype("float32")

    # Distinct merchants in the past 24h (rolling unique count). Merchants are
    # integer-encoded first because rolling windows operate on numeric data.
    df["_merch_code"] = df[config.MERCHANT_COL].astype("category").cat.codes
    distinct = (
        df.groupby(config.CARD_COL)
        .rolling("24h", on=config.TIME_COL, closed="left")["_merch_code"]
        # raw=True passes each window as an ndarray instead of building a
        # Series per row; the codes contain no NaN, so the unique count is
        # identical to Series.nunique().
        .apply(lambda codes: np.unique(codes).size, raw=True)
        .reset_index(level=0, drop=True)
    )
    # Empty windows come back as NaN; report them as 0 distinct merchants.
    df["distinct_merchants_24h"] = distinct.fillna(0).astype("float32").values
    df = df.drop(columns=["_merch_code"])
    return df
def engineer_features(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Run every feature builder and return the modelling frame.

    The input is copied, passed through the amount, temporal, demographic,
    geo, velocity and behavioral builders in that order, then re-sorted by
    transaction time. The result keeps only the columns listed in
    ``config.ALL_FEATURES`` plus target, card, merchant and time columns,
    silently skipping any that are not present.

    When ``verbose`` is true a progress line is printed after each builder.
    """
    frame = df.copy()

    pipeline = {
        "amount": _add_amount,
        "temporal": _add_temporal,
        "demographic": _add_demographic,
        "geo": _add_geo,
        "velocity": _add_velocity,
        "behavioral": _add_behavioral,
    }
    for label, builder in pipeline.items():
        frame = builder(frame)
        if verbose:
            print(f"[features] {label} done")

    # The builders leave rows grouped by card; downstream temporal splitting
    # needs them in chronological order again.
    frame = frame.sort_values(config.TIME_COL).reset_index(drop=True)

    id_and_target = [
        config.TARGET,
        config.CARD_COL,
        config.MERCHANT_COL,
        config.TIME_COL,
    ]
    available = frame.columns
    selected = [
        col for col in config.ALL_FEATURES + id_and_target if col in available
    ]
    return frame[selected]