File size: 2,388 Bytes
d3ce5a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# train.py
import asyncio
from datetime import datetime, timedelta

import pandas as pd

from features import DataPipeline
from model import StockNewsModel


def audit_leakage(X_train, X_test, train_df, test_df):
    """Print a data-leakage checklist for a train/test split and return the verdict.

    Four checks are run, each reported on stdout as ``[OK]`` or ``[FAIL]``:

    1. the latest ``date`` in ``train_df`` is strictly earlier than the
       earliest ``date`` in ``test_df``;
    2. no ``link`` value appears in both ``train_df`` and ``test_df``;
    3. no column label of ``X_train`` contains "next", "future", "target"
       or "label" (matched case-insensitively on the label's string form);
    4. ``X_train`` and ``X_test`` have the same number of columns.

    Args:
        X_train: Training feature frame; ``.columns`` and ``.shape`` are read.
        X_test: Test feature frame; only ``.shape`` is read.
        train_df: Frame with ``date`` and ``link`` columns for training rows.
        test_df: Frame with ``date`` and ``link`` columns for test rows.

    Returns:
        bool: ``True`` when all four checks pass, ``False`` otherwise.
    """

    def tag(passed):
        return "OK" if passed else "FAIL"

    rule = "=" * 60
    print(f"\n{rule}")
    print("  DATA LEAKAGE AUDIT")
    print(rule)

    # Check 1: strict temporal separation between the two splits.
    train_max = train_df["date"].max()
    test_min = test_df["date"].min()
    ok1 = train_max < test_min
    print(f"  [{tag(ok1)}] Train max date ({train_max}) < Test min date ({test_min})")

    # Check 2: the same article must not be present in both splits.
    overlap = set(train_df["link"]) & set(test_df["link"])
    ok2 = not overlap
    print(f"  [{tag(ok2)}] No shared articles (overlap={len(overlap)})")

    # Check 3: feature names hinting at look-ahead information.
    # Labels are normalised with str().lower() so that non-string labels
    # (e.g. default integer column labels) do not raise TypeError and
    # mixed-case names such as "Next_Return" are still caught.
    markers = ("next", "future", "target", "label")
    future_cols = [
        col
        for col in X_train.columns
        if any(marker in str(col).lower() for marker in markers)
    ]
    ok3 = not future_cols
    print(f"  [{tag(ok3)}] No future-looking feature names (found: {future_cols})")

    # Check 4: both feature matrices have the same width.
    n_train_features = X_train.shape[1]
    n_test_features = X_test.shape[1]
    ok4 = n_train_features == n_test_features
    print(f"  [{tag(ok4)}] Same feature count: train={n_train_features} test={n_test_features}")

    # bool() keeps the return a plain Python bool even if a comparison above
    # produced a NumPy boolean.
    all_ok = bool(ok1 and ok2 and ok3 and ok4)
    print(f"\n  {'OK ALL CHECKS PASSED' if all_ok else 'FAIL LEAKAGE DETECTED'}")
    print(rule)
    return all_ok


async def main():
    """Build the dataset, run the leakage audit, train the model and save it.

    Steps, in order:

    1. Print a banner with the ticker and the train/test window lengths.
    2. Await ``DataPipeline.build_dataset()`` for the train/test features,
       the three training targets, the test frame and a price frame.
    3. Run ``audit_leakage`` on the feature frames.
    4. Train and evaluate a ``StockNewsModel`` and save it to
       ``NSEI_model.pkl`` (the ticker with ``^`` removed, plus ``_model.pkl``).
    """
    ticker = "^NSEI"
    train_days = 180
    test_days = 14

    banner = "#" * 60
    print(f"\n{banner}")
    print("  LightGBM NEWS IMPACT MODEL - TRAINING")
    print(f"  Ticker: {ticker}")
    print(f"  Train: {train_days} days | Test: {test_days} days")
    print(banner)

    pipeline = DataPipeline(ticker, train_days=train_days, test_days=test_days)
    dataset = await pipeline.build_dataset()
    # The price frame (last element) is returned by the pipeline but not used here.
    X_train, y_dir, y_hh, y_ret, X_test, test_df, _price_df = dataset

    # NOTE(review): the training-side audit frame below is synthetic. Every
    # row gets the day before the cutoff as its date and a generated
    # "train_<i>" link, so the date and link checks in audit_leakage do not
    # inspect real training rows. Confirm whether the pipeline can expose
    # the real training dates and links.
    cutoff = (datetime.now() - timedelta(days=test_days)).date()
    day_before_cutoff = cutoff - timedelta(days=1)
    n_train = len(X_train)
    train_df_audit = pd.DataFrame({
        "date": [day_before_cutoff] * n_train,
        "link": [f"train_{i}" for i in range(n_train)],
    })
    test_df_audit = test_df[["date", "link"]].copy()

    # NOTE(review): the audit verdict is not acted upon; training proceeds
    # regardless of the outcome.
    audit_leakage(X_train, X_test, train_df_audit, test_df_audit)

    model = StockNewsModel()
    # The value returned by train_and_evaluate is not used in this script.
    model.train_and_evaluate(X_train, y_dir, y_hh, y_ret, X_test, test_df)

    stem = ticker.replace("^", "")
    model.save(f"{stem}_model.pkl")


# Script entry point: run the async training workflow only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())