| import pandas as pd
|
| import numpy as np
|
| import os
|
|
|
|
|
|
|
|
|
|
|
# Load the processed training data.
# NOTE: test.csv is first used in the inference section below, which
# re-reads it immediately before use — so the earlier duplicate read of
# test.csv at this point was dead I/O and has been dropped.
train_df = pd.read_csv("data/processed/train.csv")

print("Train columns:", train_df.columns.tolist())
|
|
|
|
|
|
|
|
|
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Relabel pivoted time-step columns as consecutive ints 1..N.

    Parameters
    ----------
    df : pd.DataFrame
        Wide frame whose first column is "id" followed by one column per
        time step (pivoted ``day_idx`` labels, assumed already in time
        order — ``pivot(...).reset_index()`` produces exactly that shape).

    Returns
    -------
    pd.DataFrame
        The same frame (columns renamed in place) with columns
        ``["id", 1, 2, ..., N]``. A frame with no step columns is
        returned unchanged.

    Notes
    -----
    The previous implementation subtracted ``min(step_cols)``, which left
    gaps whenever the original day_idx values were non-contiguous
    (e.g. [3, 7, 9] -> [1, 5, 7]) and so violated the documented
    contract. Positional renumbering always yields 1..N and matches the
    old behavior exactly for contiguous inputs.
    """
    step_cols = [c for c in df.columns if c != "id"]
    if not step_cols:
        return df

    # Positional renumbering: first step column -> 1, second -> 2, ...
    df.columns = ["id"] + list(range(1, len(step_cols) + 1))
    return df
|
|
|
|
|
|
|
|
|
|
|
|
|
# Number of trailing days per id held out for validation in each window.
lgbm_val_length = 14


# Derive the common per-id series length. Every id must have the same
# length for the fixed-size windowing below; validate with an explicit
# raise rather than `assert`, which is silently stripped under
# `python -O`. The groupby sizes also replace the previous extra
# full-frame scan that filtered on the first id only.
_id_sizes = train_df.groupby("id").size()
if _id_sizes.nunique() != 1:
    raise ValueError("All ids in train_df must have same length.")
_series_len_train = int(_id_sizes.iloc[0])

lgbm_train_length = _series_len_train - lgbm_val_length
print(f"Detected train length per id: {_series_len_train}")
print(f"LGBM train length: {lgbm_train_length}, val length: {lgbm_val_length}")
|
|
|
|
|
|
|
|
|
train_dfs = []
target_dfs = []

# Feature/metadata columns irrelevant to the wide sales matrix.
# Hoisted out of the loop: the list is identical for every id.
_DROP_COLS = [
    "product_id",
    "category_1",
    "category_2",
    "category_3",
    "store_id",
    "city_id",
    "management_group_id",
    "sale_hours",
    "sale_hour_ratio",
    "stock_hour6_22_cnt",
    "stockout_hours",
    "stockout_hour_ratio",
    "avail_hour_ratio",
    "discount",
    "holiday",
    "activity_flag",
    "precip",
    "temp",
    "humidity",
    "wind_level",
    "mean",
    "std",
    "T",
    "N",
    "ADI",
    "CV2",
    "ADI_class",
    "CV2_class",
    "regime",
]

# One pass over the frame: groupby(sort=False) yields ids in order of
# first appearance, matching the previous `train_df["id"].unique()`
# iteration, and avoids an O(rows x ids) boolean-mask filter per id.
window = lgbm_train_length + lgbm_val_length
for unique_id, id_df in train_df.groupby("id", sort=False):
    # Only sales + day_idx (+ id) are needed for the pivot below.
    subset_df = id_df.drop(columns=[c for c in _DROP_COLS if c in id_df.columns])
    subset_df = subset_df.sort_values("day_idx")

    # Slide non-overlapping (train + val) windows over the series.
    for start in range(0, len(subset_df), window):
        train_slice = subset_df.iloc[start : start + lgbm_train_length]
        test_slice = subset_df.iloc[
            start + lgbm_train_length : start + window
        ]

        # Skip trailing partial windows.
        if len(train_slice) < lgbm_train_length or len(test_slice) < lgbm_val_length:
            continue

        # Long -> wide: one row per id, one column per day.
        train_wide = rename_columns(
            train_slice.pivot(index="id", columns="day_idx", values="sales")
            .reset_index()
        )
        test_wide = rename_columns(
            test_slice.pivot(index="id", columns="day_idx", values="sales")
            .reset_index()
        )

        train_dfs.append(train_wide)
        target_dfs.append(test_wide)
|
|
|
|
|
# Fail loudly here rather than letting pd.concat raise an opaque error
# on an empty list.
if not (train_dfs and target_dfs):
    raise RuntimeError("No valid LGBM windows were created from train_df.")

os.makedirs("data/processed/lgbm_ready", exist_ok=True)

# Stack every per-id window into one wide training table plus its
# matching target table (row i of target is the horizon for row i of train).
train_lgbm = pd.concat(train_dfs, ignore_index=True)
target_lgbm = pd.concat(target_dfs, ignore_index=True)

train_lgbm.to_csv("data/processed/lgbm_ready/train.csv", index=False)
target_lgbm.to_csv("data/processed/lgbm_ready/target.csv", index=False)

print("Saved LGBM train/target to data/processed/lgbm_ready/")
|
|
|
|
|
|
|
|
|
# Reload the processed data so this section does not depend on any
# in-place mutation made by the windowing stage above.
train_df = pd.read_csv("data/processed/train.csv")
test_df = pd.read_csv("data/processed/test.csv")

inference_dfs = []
validation_dfs = []

# For each id: the model's inference input is the trailing
# lgbm_train_length days of its train series; the validation target is
# the trailing lgbm_val_length days of its test series.
for unique_id in train_df["id"].unique():
    hist = train_df[train_df["id"] == unique_id].copy()
    hist = hist.sort_values("day_idx")

    # The slice end (start + lgbm_train_length) always coincides with the
    # end of the frame, so an open-ended iloc is equivalent for any length.
    tail_start = hist.shape[0] - lgbm_train_length
    inference_wide = rename_columns(
        hist.iloc[tail_start:]
        .pivot(index="id", columns="day_idx", values="sales")
        .reset_index()
    )
    inference_dfs.append(inference_wide)

    future = test_df[test_df["id"] == unique_id].copy()
    future = future.sort_values("day_idx")

    # NOTE(review): assumes every train id appears in test_df with at
    # least lgbm_val_length rows — confirm upstream; a missing/short id
    # would yield a truncated or empty validation row here.
    val_start = future.shape[0] - lgbm_val_length
    validation_wide = rename_columns(
        future.iloc[val_start:]
        .pivot(index="id", columns="day_idx", values="sales")
        .reset_index()
    )
    validation_dfs.append(validation_wide)
|
|
|
# Mirror the guard used when saving the training windows: without it,
# an empty list would make pd.concat fail with an opaque
# "No objects to concatenate" error instead of a clear message.
if not inference_dfs or not validation_dfs:
    raise RuntimeError("No inference windows were created from train_df/test_df.")

inference_df = pd.concat(inference_dfs, ignore_index=True)
validation_df = pd.concat(validation_dfs, ignore_index=True)

os.makedirs("data/processed/lgbm_ready/inference", exist_ok=True)

inference_df.to_csv(
    "data/processed/lgbm_ready/inference/inference_train.csv",
    index=False,
)
validation_df.to_csv(
    "data/processed/lgbm_ready/inference/inference_target.csv",
    index=False,
)

print("Saved LGBM inference train/target to data/processed/lgbm_ready/inference/")
|
|
|