| import numpy as np |
| import pandas as pd |
| import pygeohash as pgh |
| from lightgbm import LGBMRegressor |
| from xgboost import XGBRegressor |
| from sklearn.metrics import r2_score |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
# Directory holding the competition CSV files.
DATA = 'e88186124ec611f1/dataset'

print("Loading data...")
# Both splits live side by side in DATA and differ only in file stem.
train, test = (pd.read_csv(f'{DATA}/{split}.csv') for split in ('train', 'test'))
|
|
| |
def parse_ts(df):
    """Return a copy of ``df`` with clock and day features added.

    Reads the ``timestamp`` column (strings whose first two ``:``-separated
    fields are hour and minute) and the numeric ``day`` column, and adds:

    * ``hour``, ``minute``    -- integer fields parsed from ``timestamp``
    * ``time_sin``, ``time_cos`` -- minute-of-day mapped onto a 24 h circle
    * ``day_sin``, ``day_cos``   -- ``day`` mapped onto a 7-day circle

    The input frame is not modified.
    """
    out = df.copy()

    # Split each stamp once and reuse the pieces for both integer columns.
    fields = [stamp.split(':') for stamp in out['timestamp']]
    out['hour'] = [int(parts[0]) for parts in fields]
    out['minute'] = [int(parts[1]) for parts in fields]

    minute_of_day = out['hour'] * 60 + out['minute']
    clock_angle = minute_of_day / (24 * 60) * 2 * np.pi
    out['time_sin'] = np.sin(clock_angle)
    out['time_cos'] = np.cos(clock_angle)

    week_angle = out['day'] / 7 * 2 * np.pi
    out['day_sin'] = np.sin(week_angle)
    out['day_cos'] = np.cos(week_angle)
    return out
|
|
def decode_geo(df):
    """Return a copy of ``df`` with coordinates derived from ``geohash``.

    Adds ``lat`` and ``lon`` (elements 0 and 1 of ``pgh.decode``'s result for
    each geohash -- presumably latitude and longitude) and ``geo3``, the
    first three characters of the geohash string. The input frame is not
    modified.
    """
    out = df.copy()

    # Decode each cell once; both coordinate columns reuse the result.
    coords = out['geohash'].map(pgh.decode)
    out['lat'] = [pair[0] for pair in coords]
    out['lon'] = [pair[1] for pair in coords]

    # Coarser spatial bucket: the 3-character geohash prefix.
    out['geo3'] = out['geohash'].str.slice(stop=3)
    return out
|
|
def encode_cats(df):
    """Return a copy of ``df`` with numeric encodings of the categorical columns.

    * ``RoadType_enc``      -- Residential=0, Street=1, Highway=2, anything else -1
    * ``LargeVehicles_enc`` -- 1.0 where ``LargeVehicles`` equals 'Allowed', else 0.0
    * ``Landmarks_enc``     -- 1.0 where ``Landmarks`` equals 'Yes', else 0.0
    * ``Weather_enc``       -- Sunny=0, Rainy=1, Foggy=2, Snowy=3, anything else -1

    The input frame is not modified.
    """
    road_codes = {'Residential': 0, 'Street': 1, 'Highway': 2}
    weather_codes = {'Sunny': 0, 'Rainy': 1, 'Foggy': 2, 'Snowy': 3}

    out = df.copy()
    # Values missing from a code table (including NaN) become -1.
    out['RoadType_enc'] = out['RoadType'].map(road_codes).fillna(-1)
    out['LargeVehicles_enc'] = out['LargeVehicles'].eq('Allowed').astype(float)
    out['Landmarks_enc'] = out['Landmarks'].eq('Yes').astype(float)
    out['Weather_enc'] = out['Weather'].map(weather_codes).fillna(-1)
    return out
|
|
# Run the three preprocessing passes (timestamp, geohash, categoricals) on
# both splits.
train = encode_cats(decode_geo(parse_ts(train)))
test = encode_cats(decode_geo(parse_ts(test)))


print("Building neighbor cache...")
all_geohashes = list(set(train['geohash']) | set(test['geohash']))

# geohash -> its eight surrounding cells, stored in the order
# [top, bottom, left, right, top-left, top-right, bottom-left, bottom-right].
neighbor_cache = {}
for cell in all_geohashes:
    above = pgh.get_adjacent(cell, 'top')
    below = pgh.get_adjacent(cell, 'bottom')
    left_of = pgh.get_adjacent(cell, 'left')
    right_of = pgh.get_adjacent(cell, 'right')
    # Diagonal cells are reached by stepping sideways from the cell above/below.
    above_left = pgh.get_adjacent(above, 'left')
    above_right = pgh.get_adjacent(above, 'right')
    below_left = pgh.get_adjacent(below, 'left')
    below_right = pgh.get_adjacent(below, 'right')
    neighbor_cache[cell] = [
        above, below, left_of, right_of,
        above_left, above_right, below_left, below_right,
    ]
|
|
def ts_offset(ts, delta_min):
    """Shift an ``'H:M'`` clock string by ``delta_min`` minutes.

    The result wraps around midnight and is formatted without zero padding
    (for example ``'9:5'``), so it can be compared against other unpadded
    ``'H:M'`` strings.
    """
    fields = ts.split(':')
    minutes_in_day = 24 * 60
    shifted = (int(fields[0]) * 60 + int(fields[1]) + delta_min) % minutes_in_day
    hours, minutes = divmod(shifted, 60)
    return f"{hours}:{minutes}"
|
|
| |
def _mean_ignoring_nan(values):
    """Return the mean of the non-NaN entries of ``values``, or NaN if none exist."""
    arr = np.asarray(values, dtype=float)
    valid = arr[~np.isnan(arr)]
    return valid.mean() if valid.size else np.nan


def build_features(df, ref_df):
    """Attach lag, neighbour and aggregate demand features to a copy of ``df``.

    Parameters
    ----------
    df : DataFrame
        Rows to featurize. Must contain ``geohash``, ``timestamp``, ``hour``
        and ``geo3``.
    ref_df : DataFrame
        Rows whose ``demand`` is used for every statistic. Must contain
        ``geohash``, ``timestamp``, ``hour``, ``geo3`` and ``demand``; a
        ``day`` column is optional and enables the day-48/49 features.

    Returns
    -------
    DataFrame
        ``df`` plus: ``lag1d`` and ``lag1d_m15`` .. ``lag1d_m60`` (reference
        demand for the same geohash at the same / earlier clock time),
        ``neighbor_mean`` (mean reference demand of the adjacent cells at the
        same clock time), per-geohash and per-``geo3`` aggregates,
        ``geo_d49_mean`` and ``daily_ratio``. Rows keep the order of ``df``;
        every merge is a left join on unique group keys.

    Notes
    -----
    NOTE(review): no row is excluded from its own statistics. When ``df`` and
    ``ref_df`` are the same frame, ``lag1d`` and ``geo_ts_mean`` are looked up
    under each row's own ``(geohash, timestamp)`` key and can therefore
    contain that row's own ``demand``. The lookup is also keyed without
    ``day``, so with several days in ``ref_df`` the last row per key wins.
    """
    df = df.copy()

    # (geohash, timestamp) -> demand. Later duplicates overwrite earlier ones.
    lag_lookup = dict(zip(zip(ref_df['geohash'], ref_df['timestamp']), ref_df['demand']))
    row_keys = list(zip(df['geohash'], df['timestamp']))

    for delta in [0, 15, 30, 45, 60]:
        if delta == 0:
            # Use the raw timestamp so the key is never re-formatted.
            df['lag1d'] = [lag_lookup.get(key, np.nan) for key in row_keys]
        else:
            df[f'lag1d_m{delta}'] = [
                lag_lookup.get((gh, ts_offset(ts, -delta)), np.nan)
                for gh, ts in row_keys
            ]

    # Mean over the adjacent cells that have a reference value. A genuine
    # mean of 0.0 is kept (the former `nanmean(...) or np.nan` turned it
    # into NaN because 0.0 is falsy).
    df['neighbor_mean'] = [
        _mean_ignoring_nan([lag_lookup.get((n, ts), np.nan)
                            for n in neighbor_cache.get(gh, [])])
        for gh, ts in row_keys
    ]

    geo_stats = (ref_df.groupby('geohash')['demand']
                 .agg(['mean', 'std', 'median', 'max']).reset_index()
                 .rename(columns={'mean': 'geo_mean', 'std': 'geo_std',
                                  'median': 'geo_median', 'max': 'geo_max'}))
    # std is NaN for single-row groups; treat that as zero spread.
    geo_stats['geo_std'] = geo_stats['geo_std'].fillna(0)
    geo_hour = (ref_df.groupby(['geohash', 'hour'])['demand']
                .mean().reset_index().rename(columns={'demand': 'geo_hour_mean'}))
    geo_ts = (ref_df.groupby(['geohash', 'timestamp'])['demand']
              .mean().reset_index().rename(columns={'demand': 'geo_ts_mean'}))
    geo3_h = (ref_df.groupby(['geo3', 'hour'])['demand']
              .mean().reset_index().rename(columns={'demand': 'geo3_hour_mean'}))
    geo3_m = (ref_df.groupby('geo3')['demand']
              .mean().reset_index().rename(columns={'demand': 'geo3_mean'}))
    hr_mean = (ref_df.groupby('hour')['demand']
               .mean().reset_index().rename(columns={'demand': 'hour_global_mean'}))

    df = df.merge(geo_stats, on='geohash', how='left')
    df = df.merge(geo_hour, on=['geohash', 'hour'], how='left')
    df = df.merge(geo_ts, on=['geohash', 'timestamp'], how='left')
    df = df.merge(geo3_h, on=['geo3', 'hour'], how='left')
    df = df.merge(geo3_m, on='geo3', how='left')
    df = df.merge(hr_mean, on='hour', how='left')

    # Day-specific subsets. Each is built from one combined boolean mask so a
    # filtered frame is never indexed with a mask from the unfiltered one.
    # Without a `day` column all three are empty.
    if 'day' in ref_df.columns:
        before_4am = ref_df['hour'] < 4
        is_day49 = ref_df['day'] == 49
        day49_early = ref_df[is_day49]
        day48_am = ref_df[(ref_df['day'] == 48) & before_4am]
        day49_am = ref_df[is_day49 & before_4am]
    else:
        day49_early = day48_am = day49_am = ref_df.iloc[0:0]

    if day49_early.empty:
        # No day-49 reference rows: a float NaN column, rather than merging an
        # empty frame whose key dtype depends on the pandas version.
        df['geo_d49_mean'] = np.nan
    else:
        d49_base = (day49_early.groupby('geohash')['demand']
                    .mean().reset_index().rename(columns={'demand': 'geo_d49_mean'}))
        df = df.merge(d49_base, on='geohash', how='left')

    # Per-geohash ratio of mean demand before 04:00 on day 49 vs. day 48.
    if not day48_am.empty and not day49_am.empty:
        d48_am_mean = day48_am.groupby('geohash')['demand'].mean().rename('d48_am')
        d49_am_mean = day49_am.groupby('geohash')['demand'].mean().rename('d49_am')
        ratio_df = (pd.concat([d48_am_mean, d49_am_mean], axis=1)
                    .rename_axis('geohash').reset_index())
        # A zero denominator becomes NaN, so the ratio is NaN rather than inf.
        ratio_df['daily_ratio'] = ratio_df['d49_am'] / ratio_df['d48_am'].replace(0, np.nan)
        df = df.merge(ratio_df[['geohash', 'daily_ratio']], on='geohash', how='left')
    else:
        df['daily_ratio'] = np.nan

    # Fill missing lags from progressively coarser estimates:
    # neighbour mean -> geohash/timestamp mean -> geohash/hour mean.
    fallback = df['neighbor_mean'].fillna(df['geo_ts_mean']).fillna(df['geo_hour_mean'])
    for col in ['lag1d', 'lag1d_m15', 'lag1d_m30', 'lag1d_m45', 'lag1d_m60']:
        df[col] = df[col].fillna(fallback)

    return df
|
|
# Model input columns, in the order they are passed to both regressors.
FEATURES = [
    # position, clock and calendar
    'lat', 'lon', 'hour', 'minute', 'day',
    'time_sin', 'time_cos', 'day_sin', 'day_cos',
    # road attributes and conditions
    'RoadType_enc', 'NumberofLanes', 'LargeVehicles_enc', 'Landmarks_enc',
    'Temperature', 'Weather_enc',
    # lagged and neighbouring demand
    'lag1d', 'lag1d_m15', 'lag1d_m30', 'lag1d_m45', 'lag1d_m60',
    'neighbor_mean',
    # aggregate demand statistics
    'geo_mean', 'geo_std', 'geo_median', 'geo_max',
    'geo_hour_mean', 'geo_ts_mean', 'geo3_hour_mean', 'geo3_mean',
    'hour_global_mean', 'geo_d49_mean', 'daily_ratio',
]


# Keyword arguments for LGBMRegressor.
LGBM_PARAMS = {
    'n_estimators': 3000,
    'learning_rate': 0.02,
    'num_leaves': 255,
    'min_child_samples': 15,
    'subsample': 0.8,
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.05,
    'reg_lambda': 0.1,
    'random_state': 42,
    'verbose': -1,
    'n_jobs': -1,
}

# Keyword arguments for XGBRegressor.
XGB_PARAMS = {
    'n_estimators': 3000,
    'learning_rate': 0.02,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.05,
    'reg_lambda': 0.1,
    'random_state': 42,
    'verbosity': 0,
    'n_jobs': -1,
    'tree_method': 'hist',
}
|
|
| |
# ---------------------------------------------------------------------------
# Hold-out check: fit on day 48, score on day 49. Every reference statistic
# for both frames is computed from the day-48 rows.
# ---------------------------------------------------------------------------
print("\nBuilding CV features (ref=day48 only)...")
train48 = train.loc[train['day'] == 48]
train49 = train.loc[train['day'] == 49]

tr_cv = build_features(train48, train48)
va_cv = build_features(train49, train48)

# Missing feature values are encoded as -1 for both models.
X_tr = tr_cv[FEATURES].fillna(-1)
X_va = va_cv[FEATURES].fillna(-1)
y_tr = train48['demand'].values
y_va = train49['demand'].values

print(f"CV β train: {X_tr.shape} val: {X_va.shape}")


print("\nTraining LGBM CV...")
lgbm_cv = LGBMRegressor(**LGBM_PARAMS)
lgbm_cv.fit(X_tr, y_tr)
lgbm_val_pred = lgbm_cv.predict(X_va)
lgbm_r2 = r2_score(y_va, lgbm_val_pred)
print(f"LGBM CV R2: {lgbm_r2:.4f} score: {max(0, 100*lgbm_r2):.2f}")


print("\nTraining XGB CV...")
xgb_cv = XGBRegressor(**XGB_PARAMS)
xgb_cv.fit(X_tr, y_tr)
xgb_val_pred = xgb_cv.predict(X_va)
xgb_r2 = r2_score(y_va, xgb_val_pred)
print(f"XGB CV R2: {xgb_r2:.4f} score: {max(0, 100*xgb_r2):.2f}")


# Grid-search the LGBM weight of the two-model blend on the validation day;
# the first weight reaching the highest R2 is kept.
best_w, best_r2 = 0, -999
for w in np.arange(0, 1.05, 0.05):
    blended_val_pred = w * lgbm_val_pred + (1 - w) * xgb_val_pred
    candidate_r2 = r2_score(y_va, blended_val_pred)
    if candidate_r2 > best_r2:
        best_r2, best_w = candidate_r2, w
print(f"\nBest blend {best_w:.2f}*LGBM + {1-best_w:.2f}*XGB: R2={best_r2:.4f} score={max(0, 100*best_r2):.2f}")


# ---------------------------------------------------------------------------
# Final fit on the whole training set, using all of it as the reference.
# ---------------------------------------------------------------------------
print("\nBuilding final features (ref=all train)...")
train_full = build_features(train, train)
test_full = build_features(test, train)

X_all = train_full[FEATURES].fillna(-1)
y_all = train['demand'].values
X_test = test_full[FEATURES].fillna(-1)


print("Training final LGBM...")
lgbm_f = LGBMRegressor(**LGBM_PARAMS)
lgbm_f.fit(X_all, y_all)


print("Training final XGB...")
xgb_f = XGBRegressor(**XGB_PARAMS)
xgb_f.fit(X_all, y_all)


# Blend with the weight chosen above and floor predictions at zero.
lgbm_test_pred = lgbm_f.predict(X_test)
xgb_test_pred = xgb_f.predict(X_test)
preds = np.clip(best_w * lgbm_test_pred + (1 - best_w) * xgb_test_pred, 0, None)


submission = pd.DataFrame({'Index': test['Index'], 'demand': preds})
submission.to_csv('submission.csv', index=False)
print(f"\nSaved submission.csv ({len(submission)} rows)")
print(submission.head())


# Feature importances of the final LGBM model, largest first.
fi = pd.Series(lgbm_f.feature_importances_, index=FEATURES).sort_values(ascending=False)
print("\nTop feature importances:")
print(fi.head(15))
|
|