# train.py
import asyncio
from datetime import datetime, timedelta

import pandas as pd

from features import DataPipeline
from model import StockNewsModel

# Substrings that flag a feature column as possibly derived from future data.
_FUTURE_MARKERS = ("next", "future", "target", "label")


def _strictly_before(earlier, later) -> bool:
    """Return True when ``earlier < later``, tolerating mixed date-like types.

    A missing value on either side (NaN/NaT, e.g. from an empty column) is
    treated as "not before" so the caller reports a failed check instead of
    crashing.
    """
    if pd.isna(earlier) or pd.isna(later):
        return False
    try:
        return bool(earlier < later)
    except TypeError:
        # Ordering comparisons between datetime.date and datetime /
        # pd.Timestamp raise TypeError; normalise both sides and retry.
        return bool(pd.Timestamp(earlier) < pd.Timestamp(later))


def audit_leakage(X_train, X_test, train_df, test_df) -> bool:
    """Print a train/test leakage report and return whether every check passed.

    Four checks are performed:
      1. the latest train date is strictly before the earliest test date;
      2. no ``link`` value appears in both ``train_df`` and ``test_df``;
      3. no column name of ``X_train`` contains a future-looking marker
         ("next", "future", "target", "label"; matched case-insensitively);
      4. ``X_train`` and ``X_test`` have the same number of columns.

    Args:
        X_train: Training feature matrix; must expose ``.columns`` and ``.shape``.
        X_test: Test feature matrix; must expose ``.shape``.
        train_df: Frame with ``"date"`` and ``"link"`` columns for the train split.
        test_df: Frame with ``"date"`` and ``"link"`` columns for the test split.

    Returns:
        True if all four checks pass, otherwise False.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(" DATA LEAKAGE AUDIT")
    print(rule)

    # Check 1: temporal ordering of the two splits.
    train_max = train_df["date"].max()
    test_min = test_df["date"].min()
    ok1 = _strictly_before(train_max, test_min)
    print(f" [{'OK' if ok1 else 'FAIL'}] Train max date ({train_max}) < Test min date ({test_min})")

    # Check 2: no article shared between the splits.
    overlap = set(train_df["link"]) & set(test_df["link"])
    ok2 = not overlap
    print(f" [{'OK' if ok2 else 'FAIL'}] No shared articles (overlap={len(overlap)})")

    # Check 3: name-based heuristic. str() keeps non-string column labels
    # from raising, lower() catches names such as "Target".
    future_cols = [
        c
        for c in X_train.columns
        if any(marker in str(c).lower() for marker in _FUTURE_MARKERS)
    ]
    ok3 = not future_cols
    print(f" [{'OK' if ok3 else 'FAIL'}] No future-looking feature names (found: {future_cols})")

    # Check 4: both matrices have the same width.
    ok4 = X_train.shape[1] == X_test.shape[1]
    print(f" [{'OK' if ok4 else 'FAIL'}] Same feature count: train={X_train.shape[1]} test={X_test.shape[1]}")

    all_ok = all((ok1, ok2, ok3, ok4))
    print(f"\n {'OK ALL CHECKS PASSED' if all_ok else 'FAIL LEAKAGE DETECTED'}")
    print(rule)
    return all_ok


async def main():
    """Build the dataset, run the leakage audit, then train and save the model."""
    TICKER = "^NSEI"
    TRAIN_DAYS = 180
    TEST_DAYS = 14

    print(f"\n{'#' * 60}")
    print(" LightGBM NEWS IMPACT MODEL - TRAINING")
    print(f" Ticker: {TICKER}")
    print(f" Train: {TRAIN_DAYS} days | Test: {TEST_DAYS} days")
    print(f"{'#' * 60}")

    pipeline = DataPipeline(TICKER, train_days=TRAIN_DAYS, test_days=TEST_DAYS)
    X_train, y_dir, y_hh, y_ret, X_test, test_df, price_df = await pipeline.build_dataset()

    # NOTE(review): build_dataset() does not hand back the training rows'
    # dates or links, so the train side of the audit is synthetic: every row
    # is dated the day before the test cutoff and given a placeholder link.
    # The date check therefore only verifies that the test rows fall on or
    # after the cutoff, and the link-overlap check cannot detect real overlap.
    now = datetime.now()
    cutoff = (now - timedelta(days=TEST_DAYS)).date()
    train_df_audit = pd.DataFrame({
        "date": [cutoff - timedelta(days=1)] * len(X_train),
        "link": [f"train_{i}" for i in range(len(X_train))],
    })
    test_df_audit = test_df[["date", "link"]].copy()

    # The audit is informational: training proceeds regardless of its result.
    audit_leakage(X_train, X_test, train_df_audit, test_df_audit)

    model = StockNewsModel()
    model.train_and_evaluate(X_train, y_dir, y_hh, y_ret, X_test, test_df)

    out_model = f'{TICKER.replace("^", "")}_model.pkl'
    model.save(out_model)


if __name__ == "__main__":
    asyncio.run(main())