| |
| import asyncio |
| from datetime import datetime, timedelta |
|
|
| import pandas as pd |
|
|
| from features import DataPipeline |
| from model import StockNewsModel |
|
|
|
|
def audit_leakage(X_train, X_test, train_df, test_df):
    """Print a data-leakage audit report and return whether every check passed.

    Four independent checks are run and reported on stdout:

    1. Temporal split: the latest ``train_df["date"]`` is strictly earlier
       than the earliest ``test_df["date"]``.
    2. Article overlap: no ``link`` value occurs in both ``train_df`` and
       ``test_df``.
    3. Feature names: no column label of ``X_train`` contains ``next``,
       ``future``, ``target`` or ``label``. The match is a case-insensitive
       substring test on ``str(label)``, so non-string labels (e.g. integer
       column positions) are handled and mixed-case names are caught.
    4. Feature count: ``X_train`` and ``X_test`` have the same number of
       columns (only the count is compared, not names or order).

    Args:
        X_train: Training feature matrix exposing ``.columns`` and ``.shape``.
        X_test: Test feature matrix exposing ``.shape``.
        train_df: Frame with ``"date"`` and ``"link"`` columns for training rows.
        test_df: Frame with ``"date"`` and ``"link"`` columns for test rows.

    Returns:
        bool: ``True`` when all four checks pass, ``False`` otherwise.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(" DATA LEAKAGE AUDIT")
    print(rule)

    # Check 1: every training row must predate every test row.
    train_max = train_df["date"].max()
    test_min = test_df["date"].min()
    # bool() keeps the return type a plain bool even if the comparison
    # yields a NumPy boolean.
    ok1 = bool(train_max < test_min)
    print(f" [{'OK' if ok1 else 'FAIL'}] Train max date ({train_max}) < Test min date ({test_min})")

    # Check 2: the same article must not appear on both sides of the split.
    overlap = set(train_df["link"]) & set(test_df["link"])
    ok2 = not overlap
    print(f" [{'OK' if ok2 else 'FAIL'}] No shared articles (overlap={len(overlap)})")

    # Check 3: flag feature names that hint at forward-looking information.
    # Labels are coerced to str and lower-cased so integer labels do not
    # raise TypeError and names such as "Target_Return" are not missed.
    suspicious_tokens = ("next", "future", "target", "label")
    future_cols = [
        col
        for col in X_train.columns
        if any(token in str(col).lower() for token in suspicious_tokens)
    ]
    ok3 = not future_cols
    print(f" [{'OK' if ok3 else 'FAIL'}] No future-looking feature names (found: {future_cols})")

    # Check 4: train and test matrices must have the same width.
    n_train_features = X_train.shape[1]
    n_test_features = X_test.shape[1]
    ok4 = n_train_features == n_test_features
    print(f" [{'OK' if ok4 else 'FAIL'}] Same feature count: train={n_train_features} test={n_test_features}")

    all_ok = all((ok1, ok2, ok3, ok4))
    print(f"\n {'OK ALL CHECKS PASSED' if all_ok else 'FAIL LEAKAGE DETECTED'}")
    print(rule)
    return all_ok
|
|
|
|
async def main():
    """Build the dataset, run the leakage audit, then train and save the model.

    Steps, in order:

    1. Print a banner with the ticker and the train/test window lengths.
    2. Await ``DataPipeline.build_dataset()`` for the training features,
       three target vectors, test features, test frame and price frame.
    3. Run ``audit_leakage`` on the feature matrices.
    4. Train and evaluate a ``StockNewsModel``.
    5. Save the model under a file name derived from the ticker with any
       ``^`` removed.
    """
    ticker = "^NSEI"
    train_days = 180
    test_days = 14

    print(f"\n{'#' * 60}")
    print(" LightGBM NEWS IMPACT MODEL - TRAINING")
    print(f" Ticker: {ticker}")
    print(f" Train: {train_days} days | Test: {test_days} days")
    print(f"{'#' * 60}")

    pipeline = DataPipeline(ticker, train_days=train_days, test_days=test_days)
    (
        X_train,
        y_dir,
        y_hh,
        y_ret,
        X_test,
        test_df,
        price_df,
    ) = await pipeline.build_dataset()

    # The split boundary is computed from the wall clock after the dataset
    # has been built: today minus the test window.
    cutoff = (datetime.now() - timedelta(days=test_days)).date()

    # NOTE(review): the training side of the audit is synthetic. Every row is
    # dated the day before the cutoff and given a unique placeholder link, so
    # the date and overlap checks only exercise the test frame's values.
    placeholder_date = cutoff - timedelta(days=1)
    n_train_rows = len(X_train)
    train_df_audit = pd.DataFrame({
        "date": [placeholder_date] * n_train_rows,
        "link": [f"train_{i}" for i in range(n_train_rows)],
    })
    test_df_audit = test_df[["date", "link"]].copy()

    # The audit's boolean result is not acted on; training proceeds either way.
    audit_leakage(X_train, X_test, train_df_audit, test_df_audit)

    news_model = StockNewsModel()
    results = news_model.train_and_evaluate(
        X_train, y_dir, y_hh, y_ret, X_test, test_df
    )

    model_path = ticker.replace("^", "") + "_model.pkl"
    news_model.save(model_path)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: drive the async training pipeline to completion
    # on a fresh event loop. Nothing runs when the module is imported.
    asyncio.run(main())
|
|