portguard-api / src /features.py
NickPatel17's picture
Upload 15 files
4856467 verified
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from src.config import (
TARGET_MAP, NUMERICAL_COLS, CATEGORICAL_COLS, RAW_DROP_COLS, SMOOTH_M,
)
def _clean(train_df, test_df):
    """Impute missing values and derive calendar features on both frames.

    All imputation statistics are computed from ``train_df`` before anything
    is filled, and the same statistics are then applied to ``test_df``.
    Both frames are modified in place.

    Per frame, in this order:
      1. Each column in ``NUMERICAL_COLS`` that contains NaN is filled with
         the training median for the row's ``Destination_Port``; anything
         still missing gets the overall training median of that column.
      2. Each column in ``CATEGORICAL_COLS`` that contains NaN is filled
         with the literal ``"UNKNOWN"``.
      3. ``Declaration_Date (YYYY-MM-DD)`` is converted to datetime and
         ``Declaration_DayOfWeek`` is taken from it; ``Declaration_Hour`` is
         taken from ``Declaration_Time`` parsed as ``%H:%M:%S``. Values that
         fail to parse are coerced to missing rather than raising.

    Args:
        train_df: Training frame; source of every imputation statistic.
        test_df: Test frame; imputed with the training statistics.

    Returns:
        The same ``(train_df, test_df)`` objects that were passed in.
    """
    date_col = "Declaration_Date (YYYY-MM-DD)"

    # Fit the imputation statistics once, on the still-unfilled training data.
    medians_by_port = train_df.groupby("Destination_Port")[NUMERICAL_COLS].median()
    overall_medians = train_df[NUMERICAL_COLS].median()

    for frame in (train_df, test_df):
        for name in NUMERICAL_COLS:
            column = frame[name]
            if not column.isna().any():
                continue
            # Port-level median first, overall median as the fallback.
            per_port = frame["Destination_Port"].map(medians_by_port[name])
            frame[name] = column.fillna(per_port).fillna(overall_medians[name])

        for name in CATEGORICAL_COLS:
            if frame[name].isna().any():
                frame[name] = frame[name].fillna("UNKNOWN")

        frame[date_col] = pd.to_datetime(frame[date_col], errors="coerce")
        frame["Declaration_DayOfWeek"] = frame[date_col].dt.dayofweek

        clock = pd.to_datetime(
            frame["Declaration_Time"], format="%H:%M:%S", errors="coerce"
        )
        frame["Declaration_Hour"] = clock.dt.hour

    return train_df, test_df
def _engineer_discrepancy(df):
df["Log_Declared_Value"] = np.log1p(df["Declared_Value"])
safe_weight = df["Measured_Weight"].replace(0, np.nan).fillna(1)
df["Log_Value_to_Weight_Ratio"] = np.log1p(df["Declared_Value"] / safe_weight)
df["Weight_Diff_Ratio"] = (
(df["Measured_Weight"] - df["Declared_Weight"]) / (df["Declared_Weight"] + 1)
)
return df
def _engineer_behavioural(train_df, test_df):
for df in (train_df, test_df):
df["Route_ID"] = df["Origin_Country"].str.cat(
df["Destination_Country"], sep="_"
)
for df in (train_df, test_df):
df["HS_Category"] = (df["HS_Code"].astype(str).str[:2]).astype(int)
importer_freq = train_df["Importer_ID"].value_counts()
global_importer_median = int(importer_freq.median())
for df in (train_df, test_df):
df["Importer_Freq_Count"] = (
df["Importer_ID"]
.map(importer_freq)
.fillna(global_importer_median)
.astype(int)
)
route_freq = train_df["Route_ID"].value_counts()
rare_routes = set(route_freq[route_freq < 5].index)
for df in (train_df, test_df):
df["Rare_Route_Flag"] = df["Route_ID"].apply(
lambda r: 1 if (r in rare_routes or r not in route_freq.index) else 0
).astype(np.int8)
line_avg_dwell = train_df.groupby("Shipping_Line")["Dwell_Time_Hours"].mean()
global_avg_dwell = train_df["Dwell_Time_Hours"].mean()
for df in (train_df, test_df):
df["Shipping_Line_Avg_Dwell"] = (
df["Shipping_Line"].map(line_avg_dwell).fillna(global_avg_dwell)
)
for df in (train_df, test_df):
df["Dwell_Time_Deviation"] = (
df["Dwell_Time_Hours"] / (df["Shipping_Line_Avg_Dwell"] + 1)
)
return train_df, test_df
def _engineer_smoothed_target_encoding(train_df, test_df):
    """Add smoothed risk-rate encodings for importer and HS category.

    First derives two label columns on ``train_df`` only: ``Target``
    (``Clearance_Status`` mapped through ``TARGET_MAP``) and ``Is_Risky``
    (int8, 1 where ``Target >= 1``).

    Then, for each key column, the per-group mean of ``Is_Risky`` in the
    training data is pulled toward the overall training mean:

        (count * group_mean + SMOOTH_M * overall_mean) / (count + SMOOTH_M)

    The result is mapped onto both frames; keys with no training rows get
    the overall training mean. Both frames are modified in place.

    Features produced:
      * ``Importer_Risk_Index`` from ``Importer_ID``
      * ``HS_Risk_Index`` from ``HS_Category``

    Args:
        train_df: Training frame with ``Clearance_Status``, ``Importer_ID``
            and ``HS_Category``.
        test_df: Test frame with ``Importer_ID`` and ``HS_Category``.

    Returns:
        The same ``(train_df, test_df)`` objects that were passed in.
    """
    train_df["Target"] = train_df["Clearance_Status"].map(TARGET_MAP)
    train_df["Is_Risky"] = (train_df["Target"] >= 1).astype(np.int8)
    prior = train_df["Is_Risky"].mean()

    encodings = (
        ("Importer_ID", "Importer_Risk_Index"),
        ("HS_Category", "HS_Risk_Index"),
    )
    for key_col, feature in encodings:
        per_group = train_df.groupby(key_col)["Is_Risky"].agg(["mean", "count"])
        n_rows = per_group["count"]
        rate = per_group["mean"]
        # Small groups lean on the prior, large groups on their own rate.
        shrunk = (n_rows * rate + SMOOTH_M * prior) / (n_rows + SMOOTH_M)

        for frame in (train_df, test_df):
            frame[feature] = frame[key_col].map(shrunk).fillna(prior)

    return train_df, test_df
def _engineer_recovered_features(train_df, test_df):
    """One-hot encode trade regime and target-encode origin and exporter.

    Steps:
      1. ``Trade_Regime (Import / Export / Transit)`` is expanded into int8
         dummy columns prefixed ``Trade``. The test dummies are reindexed
         to the training dummy columns, so categories missing from the test
         frame become all-zero columns and categories present only in the
         test frame are dropped.
      2. ``Origin_Country`` and ``Exporter_ID`` are each passed through a
         ``TargetEncoder(smoothing=10)`` fitted on the training frame
         against ``train_df["Is_Risky"]``; the outputs are stored as
         ``Origin_Country_Risk`` and ``Exporter_Risk``.
      3. The three raw source columns are removed from both frames.

    Progress information is printed to stdout.

    Args:
        train_df: Training frame; must already contain ``Is_Risky``.
        test_df: Test frame.

    Returns:
        ``(train_df, test_df)`` as new frames produced by concatenation
        (not the objects that were passed in).
    """
    trade_col = "Trade_Regime (Import / Export / Transit)"
    risk_label = train_df["Is_Risky"]

    train_dummies = pd.get_dummies(
        train_df[[trade_col]], prefix="Trade", dtype=np.int8
    )
    test_dummies = pd.get_dummies(
        test_df[[trade_col]], prefix="Trade", dtype=np.int8
    )
    # Force the test dummy layout to match what was seen in training.
    test_dummies = test_dummies.reindex(
        columns=train_dummies.columns, fill_value=0
    )

    train_df = pd.concat([train_df, train_dummies], axis=1)
    test_df = pd.concat([test_df, test_dummies], axis=1)
    print(f" Trade Regime dummies: {train_dummies.columns.tolist()}")

    targets = (
        ("Origin_Country", "Origin_Country_Risk"),
        ("Exporter_ID", "Exporter_Risk"),
    )
    for source_col, risk_col in targets:
        encoder = TargetEncoder(cols=[source_col], smoothing=10)
        fitted = encoder.fit_transform(train_df[[source_col]], risk_label)
        applied = encoder.transform(test_df[[source_col]])
        # Assign by position so the encoder's output index is irrelevant.
        train_df[risk_col] = fitted[source_col].values
        test_df[risk_col] = applied[source_col].values

    print(f" Origin_Country_Risk — train mean: "
          f"{train_df['Origin_Country_Risk'].mean():.4f}")
    print(f" Exporter_Risk — train mean: "
          f"{train_df['Exporter_Risk'].mean():.4f}")

    consumed = [trade_col, "Origin_Country", "Exporter_ID"]
    train_df.drop(columns=consumed, inplace=True)
    test_df.drop(columns=consumed, inplace=True)

    return train_df, test_df
def preprocess_and_engineer(train_df, test_df):
    """Run the full feature pipeline and return model-ready matrices.

    Stages, in order: cleaning, discrepancy features, behavioural features,
    smoothed target encoding, recovered features. Afterwards the container
    IDs and the training target are set aside, the columns listed in
    ``RAW_DROP_COLS`` plus ``Container_ID`` / ``Target`` are removed, and
    both frames are restricted to the columns they share, sorted by name.

    The early stages modify the frames passed in; progress is printed to
    stdout.

    Args:
        train_df: Raw training frame.
        test_df: Raw test frame.

    Returns:
        A 5-tuple ``(X_train, X_test, y_train, train_ids, test_ids)``:
        the two feature frames with identical, sorted columns; the training
        ``Target`` series; and the ``Container_ID`` series of each frame.
    """
    print("[Features] Cleaning...")
    train_df, test_df = _clean(train_df, test_df)

    print("[Features] Discrepancy features...")
    train_df = _engineer_discrepancy(train_df)
    test_df = _engineer_discrepancy(test_df)

    # Remaining stages all share the (train, test) -> (train, test) shape.
    paired_stages = (
        ("[Features] Behavioural features...",
         _engineer_behavioural),
        ("[Features] Smoothed target encoding (Importer, HS)...",
         _engineer_smoothed_target_encoding),
        ("[Features] Recovered features (Trade, Origin, Exporter)...",
         _engineer_recovered_features),
    )
    for banner, stage in paired_stages:
        print(banner)
        train_df, test_df = stage(train_df, test_df)

    # Set identifiers and labels aside before they are dropped.
    train_ids = train_df["Container_ID"].copy()
    test_ids = test_df["Container_ID"].copy()
    y_train = train_df["Target"].copy()

    train_removals = [c for c in RAW_DROP_COLS if c in train_df.columns]
    train_removals += ["Container_ID", "Target"]
    train_df.drop(columns=train_removals, inplace=True)

    # The test frame may lack some of these, so missing labels are ignored.
    test_removals = [c for c in RAW_DROP_COLS if c in test_df.columns]
    test_removals += ["Container_ID", "Target"]
    test_df.drop(columns=test_removals, inplace=True, errors="ignore")

    # Train-only columns fall away here because only the shared, sorted
    # column set is kept.
    common_cols = sorted(set(train_df.columns).intersection(test_df.columns))
    X_train = train_df[common_cols].copy()
    X_test = test_df[common_cols].copy()

    print(f"[Features] Done — X_train {X_train.shape} X_test {X_test.shape}")
    print(f" Columns: {X_train.columns.tolist()}")
    return X_train, X_test, y_train, train_ids, test_ids