# File size: 10,234 Bytes

import numpy as np
import pandas as pd
import pygeohash as pgh
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

# Directory that holds train.csv and test.csv.
DATA = 'e88186124ec611f1/dataset'

print("Loading data...")
# Read both splits with the same path template; train is read first.
train, test = (pd.read_csv(f'{DATA}/{split}.csv') for split in ('train', 'test'))

# ── Parse / encode ────────────────────────────────────────────────────────────
def parse_ts(df):
    """Return a copy of ``df`` with clock and cyclical time columns added.

    Reads ``timestamp`` (strings whose first two ``:``-separated fields are
    the integer hour and minute) and the numeric ``day`` column.  Adds, in
    this order: ``hour``, ``minute``, ``time_sin``, ``time_cos``, ``day_sin``
    and ``day_cos``.  The input frame is left untouched.
    """
    out = df.copy()

    def _clock_field(position):
        # Integer value of one ':'-separated field of every timestamp.
        return out['timestamp'].map(lambda stamp: int(stamp.split(':')[position]))

    out['hour'] = _clock_field(0)
    out['minute'] = _clock_field(1)

    # Minute-of-day expressed as an angle: a full day is one turn.
    minute_of_day = out['hour'] * 60 + out['minute']
    clock_angle = minute_of_day / (24 * 60) * 2 * np.pi
    out['time_sin'] = np.sin(clock_angle)
    out['time_cos'] = np.cos(clock_angle)

    # Day number expressed as an angle with a period of 7.
    week_angle = out['day'] / 7 * 2 * np.pi
    out['day_sin'] = np.sin(week_angle)
    out['day_cos'] = np.cos(week_angle)
    return out

def decode_geo(df):
    """Return a copy of ``df`` with decoded coordinates and a coarse prefix.

    Each ``geohash`` is passed to ``pgh.decode``; element 0 of the result is
    stored as ``lat`` and element 1 as ``lon``.  ``geo3`` holds the first
    three characters of the geohash.  The input frame is left untouched.
    """
    out = df.copy()
    coords = out['geohash'].map(pgh.decode)

    def _component(position):
        # One element of every decoded result.
        return coords.map(lambda pair: pair[position])

    out['lat'] = _component(0)
    out['lon'] = _component(1)
    # Three-character prefix, used as a coarser grouping key.
    out['geo3'] = out['geohash'].str.slice(stop=3)
    return out

def encode_cats(df):
    """Return a copy of ``df`` with numeric encodings of the categoricals.

    * ``RoadType_enc`` / ``Weather_enc``: integer codes from a fixed table;
      any value not in the table (including missing) becomes ``-1``.
    * ``LargeVehicles_enc`` / ``Landmarks_enc``: ``1.0`` when the value is
      exactly ``'Allowed'`` / ``'Yes'``, otherwise ``0.0``.

    The input frame is left untouched.
    """
    road_codes = {'Residential': 0, 'Street': 1, 'Highway': 2}
    weather_codes = {'Sunny': 0, 'Rainy': 1, 'Foggy': 2, 'Snowy': 3}

    out = df.copy()
    out['RoadType_enc'] = out['RoadType'].map(road_codes).fillna(-1)
    out['LargeVehicles_enc'] = out['LargeVehicles'].eq('Allowed').astype(float)
    out['Landmarks_enc'] = out['Landmarks'].eq('Yes').astype(float)
    out['Weather_enc'] = out['Weather'].map(weather_codes).fillna(-1)
    return out

# Apply the three row-wise featurizers to each split: time, then geo, then categoricals.
train = encode_cats(decode_geo(parse_ts(train)))
test = encode_cats(decode_geo(parse_ts(test)))

# ── Neighbor cache ────────────────────────────────────────────────────────────
print("Building neighbor cache...")


def _adjacent_eight(cell):
    """Return the eight geohashes surrounding ``cell``.

    Order: top, bottom, left, right, top-left, top-right, bottom-left,
    bottom-right.  The diagonal cells are reached by stepping left/right
    from the top and bottom cells.
    """
    top = pgh.get_adjacent(cell, 'top')
    bottom = pgh.get_adjacent(cell, 'bottom')
    left = pgh.get_adjacent(cell, 'left')
    right = pgh.get_adjacent(cell, 'right')
    top_left = pgh.get_adjacent(top, 'left')
    top_right = pgh.get_adjacent(top, 'right')
    bottom_left = pgh.get_adjacent(bottom, 'left')
    bottom_right = pgh.get_adjacent(bottom, 'right')
    return [top, bottom, left, right,
            top_left, top_right, bottom_left, bottom_right]


# Every geohash that occurs in either split gets its neighbours computed once.
all_geohashes = list(set(train['geohash']) | set(test['geohash']))
neighbor_cache = {cell: _adjacent_eight(cell) for cell in all_geohashes}

def ts_offset(ts, delta_min):
    """Shift an ``"H:M"`` clock string by ``delta_min`` minutes.

    The result wraps around midnight and is formatted without zero padding
    (e.g. ``"9:5"``), whatever the padding of the input.
    """
    fields = ts.split(':')
    minute_of_day = int(fields[0]) * 60 + int(fields[1]) + delta_min
    # Python's modulo is non-negative here, so negative shifts wrap backwards.
    hours, minutes = divmod(minute_of_day % (24 * 60), 60)
    return f"{hours}:{minutes}"

# ── Core feature builder (given a reference day's lookup dict) ───────────────
def _nanmean_or_nan(values):
    """Return the mean of the non-NaN entries of ``values``, or NaN if none exist."""
    arr = np.asarray(values, dtype=float)
    if arr.size == 0 or np.isnan(arr).all():
        return np.nan
    return np.nanmean(arr)


def build_features(df, ref_df):
    """Attach lag, neighbour and aggregate demand features to a copy of ``df``.

    Every statistic is computed from ``ref_df`` and joined onto ``df``.

    Parameters
    ----------
    df : DataFrame
        Rows to featurize.  Must contain ``geohash``, ``timestamp``,
        ``hour`` and ``geo3``.
    ref_df : DataFrame
        Rows that supply the statistics.  Must contain ``geohash``,
        ``timestamp``, ``hour``, ``geo3`` and ``demand``.  ``day`` is
        optional; without it the day-48/day-49 features are all NaN.

    Returns
    -------
    DataFrame
        Copy of ``df`` with these columns added: ``lag1d``,
        ``lag1d_m15`` ... ``lag1d_m60``, ``neighbor_mean``, ``geo_mean``,
        ``geo_std``, ``geo_median``, ``geo_max``, ``geo_hour_mean``,
        ``geo_ts_mean``, ``geo3_hour_mean``, ``geo3_mean``,
        ``hour_global_mean``, ``geo_d49_mean`` and ``daily_ratio``.
        The merges give the result a fresh integer index.

    Notes
    -----
    * The lag lookup is keyed by ``(geohash, timestamp)`` only.  If
      ``ref_df`` spans several days, the last row for a key wins.
    * If ``df`` rows are also present in ``ref_df``, ``lag1d`` and
      ``geo_ts_mean`` can be derived from the row's own ``demand``.
      Callers that need leakage-free features must pass a disjoint
      ``ref_df``.
    * Days 48 and 49 and the ``hour < 4`` morning window are hard-coded.
    """
    df = df.copy()

    # (geohash, timestamp) -> demand from the reference rows.
    lag_lookup = dict(zip(zip(ref_df['geohash'], ref_df['timestamp']), ref_df['demand']))

    geohashes = list(df['geohash'])
    timestamps = list(df['timestamp'])
    distinct_ts = set(timestamps)

    # Lags at T, T-15, T-30, T-45 and T-60 minutes.
    for delta in (0, 15, 30, 45, 60):
        col = 'lag1d' if delta == 0 else f'lag1d_m{delta}'
        # Shift each distinct timestamp once instead of once per row.  The
        # unshifted lag uses the timestamp string exactly as given.
        shifted = {ts: ts if delta == 0 else ts_offset(ts, -delta)
                   for ts in distinct_ts}
        df[col] = [lag_lookup.get((gh, shifted[ts]), np.nan)
                   for gh, ts in zip(geohashes, timestamps)]

    # Mean reference demand of the surrounding cells at the same timestamp.
    # A mean of exactly 0.0 is kept; only "no neighbour data" yields NaN.
    df['neighbor_mean'] = [
        _nanmean_or_nan([lag_lookup.get((n, ts), np.nan)
                         for n in neighbor_cache.get(gh, [])])
        for gh, ts in zip(geohashes, timestamps)
    ]

    # Aggregations from ref_df.
    geo_stats = (ref_df.groupby('geohash')['demand']
                 .agg(['mean', 'std', 'median', 'max']).reset_index()
                 .rename(columns={'mean': 'geo_mean', 'std': 'geo_std',
                                  'median': 'geo_median', 'max': 'geo_max'}))
    # std is NaN for a geohash with a single reference row.
    geo_stats['geo_std'] = geo_stats['geo_std'].fillna(0)
    geo_hour = (ref_df.groupby(['geohash', 'hour'])['demand']
                .mean().reset_index().rename(columns={'demand': 'geo_hour_mean'}))
    geo_ts = (ref_df.groupby(['geohash', 'timestamp'])['demand']
              .mean().reset_index().rename(columns={'demand': 'geo_ts_mean'}))
    geo3_h = (ref_df.groupby(['geo3', 'hour'])['demand']
              .mean().reset_index().rename(columns={'demand': 'geo3_hour_mean'}))
    geo3_m = (ref_df.groupby('geo3')['demand']
              .mean().reset_index().rename(columns={'demand': 'geo3_mean'}))
    hr_mean = (ref_df.groupby('hour')['demand']
               .mean().reset_index().rename(columns={'demand': 'hour_global_mean'}))

    # Day-specific slices.  Both need a 'day' column, so guard them together
    # and build each slice from one combined mask on ref_df.
    if 'day' in ref_df.columns:
        day49_rows = ref_df[ref_df['day'] == 49]
        day48_am = ref_df[(ref_df['day'] == 48) & (ref_df['hour'] < 4)]
    else:
        day49_rows = ref_df.iloc[0:0]
        day48_am = ref_df.iloc[0:0]
    day49_am = day49_rows[day49_rows['hour'] < 4]

    # Per-geohash mean over whatever day-49 rows the reference holds.
    if len(day49_rows):
        d49_base = (day49_rows.groupby('geohash')['demand']
                    .mean().reset_index().rename(columns={'demand': 'geo_d49_mean'}))
    else:
        # Typed empty frame so the merge key dtype matches the reference.
        d49_base = pd.DataFrame({
            'geohash': pd.Series(dtype=ref_df['geohash'].dtype),
            'geo_d49_mean': pd.Series(dtype=float),
        })

    df = df.merge(geo_stats, on='geohash', how='left')
    df = df.merge(geo_hour, on=['geohash', 'hour'], how='left')
    df = df.merge(geo_ts, on=['geohash', 'timestamp'], how='left')
    df = df.merge(geo3_h, on=['geo3', 'hour'], how='left')
    df = df.merge(geo3_m, on='geo3', how='left')
    df = df.merge(hr_mean, on='hour', how='left')
    df = df.merge(d49_base, on='geohash', how='left')

    # Daily ratio per geohash: day-49 morning mean / day-48 morning mean.
    if len(day48_am) and len(day49_am):
        d48_am_mean = day48_am.groupby('geohash')['demand'].mean().rename('d48_am')
        d49_am_mean = day49_am.groupby('geohash')['demand'].mean().rename('d49_am')
        ratio_df = pd.concat([d48_am_mean, d49_am_mean], axis=1).reset_index()
        # A zero denominator becomes NaN rather than inf.
        ratio_df['daily_ratio'] = ratio_df['d49_am'] / ratio_df['d48_am'].replace(0, np.nan)
        ratio_df = ratio_df[['geohash', 'daily_ratio']]
        df = df.merge(ratio_df, on='geohash', how='left')
    else:
        df['daily_ratio'] = np.nan

    # Impute missing lags: neighbour mean, then geohash/timestamp mean, then
    # geohash/hour mean.
    fallback = df['neighbor_mean'].fillna(df['geo_ts_mean']).fillna(df['geo_hour_mean'])
    for col in ['lag1d', 'lag1d_m15', 'lag1d_m30', 'lag1d_m45', 'lag1d_m60']:
        df[col] = df[col].fillna(fallback)

    return df

# Model input columns, in the order they are passed to both regressors.
FEATURES = [
    # coordinates and raw clock/calendar fields
    'lat',
    'lon',
    'hour',
    'minute',
    'day',
    # cyclical encodings
    'time_sin',
    'time_cos',
    'day_sin',
    'day_cos',
    # road and weather attributes
    'RoadType_enc',
    'NumberofLanes',
    'LargeVehicles_enc',
    'Landmarks_enc',
    'Temperature',
    'Weather_enc',
    # lagged demand
    'lag1d',
    'lag1d_m15',
    'lag1d_m30',
    'lag1d_m45',
    'lag1d_m60',
    'neighbor_mean',
    # per-geohash aggregates
    'geo_mean',
    'geo_std',
    'geo_median',
    'geo_max',
    'geo_hour_mean',
    'geo_ts_mean',
    'geo3_hour_mean',
    'geo3_mean',
    # global and day-specific aggregates
    'hour_global_mean',
    'geo_d49_mean',
    'daily_ratio',
]

# Keyword arguments for LGBMRegressor.
LGBM_PARAMS = {
    'n_estimators': 3000,
    'learning_rate': 0.02,
    'num_leaves': 255,
    'min_child_samples': 15,
    'subsample': 0.8,
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.05,
    'reg_lambda': 0.1,
    'random_state': 42,
    'verbose': -1,
    'n_jobs': -1,
}

# Keyword arguments for XGBRegressor.
XGB_PARAMS = {
    'n_estimators': 3000,
    'learning_rate': 0.02,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.05,
    'reg_lambda': 0.1,
    'random_state': 42,
    'verbosity': 0,
    'n_jobs': -1,
    'tree_method': 'hist',
}

# ── Proper CV: ref = day48 only ───────────────────────────────────────────────
print("\nBuilding CV features (ref=day48 only)...")
# Hold-out split by day: fit on day 48, validate on day 49.
train48 = train.loc[train['day'] == 48]
train49 = train.loc[train['day'] == 49]

# Both frames take their statistics from day 48 only.
# NOTE(review): day 48 is featurized against itself, so the (geohash,
# timestamp) lag lookup in build_features can return each training row's own
# demand -- confirm this is intended.
tr_cv = build_features(df=train48, ref_df=train48)
va_cv = build_features(df=train49, ref_df=train48)

# Targets come from the unfeaturized frames and are matched to the feature
# rows by position. Remaining NaNs are replaced by the sentinel -1.
X_tr, y_tr = tr_cv[FEATURES].fillna(-1), train48['demand'].values
X_va, y_va = va_cv[FEATURES].fillna(-1), train49['demand'].values

print(f"CV — train: {X_tr.shape}  val: {X_va.shape}")

print("\nTraining LGBM CV...")
lgbm_cv = LGBMRegressor(**LGBM_PARAMS)
lgbm_cv.fit(X_tr, y_tr)
lp = lgbm_cv.predict(X_va)          # LGBM validation predictions
lgbm_r2 = r2_score(y_va, lp)
print(f"LGBM CV R2: {lgbm_r2:.4f}  score: {max(0, 100*lgbm_r2):.2f}")

print("\nTraining XGB CV...")
xgb_cv = XGBRegressor(**XGB_PARAMS)
xgb_cv.fit(X_tr, y_tr)
xp = xgb_cv.predict(X_va)           # XGB validation predictions
xgb_r2 = r2_score(y_va, xp)
print(f"XGB CV R2:  {xgb_r2:.4f}  score: {max(0, 100*xgb_r2):.2f}")

# Grid-search the LGBM weight of a two-model blend in steps of 0.05; the
# first weight reaching the highest validation R2 is kept.
best_w, best_r2 = 0, -999
for w in np.arange(0, 1.05, 0.05):
    r2 = r2_score(y_va, w * lp + (1 - w) * xp)
    if r2 > best_r2:
        best_w, best_r2 = w, r2
print(f"\nBest blend {best_w:.2f}*LGBM + {1-best_w:.2f}*XGB: R2={best_r2:.4f}  score={max(0, 100*best_r2):.2f}")

# ── Final model: ref = ALL train ──────────────────────────────────────────────
print("\nBuilding final features (ref=all train)...")
# Both frames now take their statistics from the complete training set.
train_full = build_features(df=train, ref_df=train)
test_full = build_features(df=test, ref_df=train)

# Remaining NaNs are replaced by the sentinel -1, as in the CV run.
X_all, y_all = train_full[FEATURES].fillna(-1), train['demand'].values
X_test = test_full[FEATURES].fillna(-1)

print("Training final LGBM...")
lgbm_f = LGBMRegressor(**LGBM_PARAMS)
lgbm_f.fit(X_all, y_all)

print("Training final XGB...")
xgb_f = XGBRegressor(**XGB_PARAMS)
xgb_f.fit(X_all, y_all)

# Blend with the weight chosen on the validation day and floor at zero.
preds = np.clip(
    best_w * lgbm_f.predict(X_test) + (1 - best_w) * xgb_f.predict(X_test),
    a_min=0,
    a_max=None,
)

submission = pd.DataFrame({'Index': test['Index'], 'demand': preds})
submission.to_csv('submission.csv', index=False)
print(f"\nSaved submission.csv  ({len(submission)} rows)")
print(submission.head())

# LGBM feature importances, largest first.
fi = pd.Series(lgbm_f.feature_importances_, index=FEATURES).sort_values(ascending=False)
print("\nTop feature importances:")
print(fi.head(15))